biblicus 0.14.0__py3-none-any.whl → 0.15.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biblicus/__init__.py +1 -1
- biblicus/_vendor/dotyaml/__init__.py +2 -2
- biblicus/_vendor/dotyaml/loader.py +40 -1
- biblicus/ai/__init__.py +39 -0
- biblicus/ai/embeddings.py +114 -0
- biblicus/ai/llm.py +138 -0
- biblicus/ai/models.py +226 -0
- biblicus/analysis/__init__.py +5 -2
- biblicus/analysis/markov.py +1624 -0
- biblicus/analysis/models.py +754 -1
- biblicus/analysis/topic_modeling.py +98 -19
- biblicus/backends/sqlite_full_text_search.py +4 -2
- biblicus/cli.py +118 -23
- biblicus/recipes.py +136 -0
- biblicus/text/__init__.py +43 -0
- biblicus/text/annotate.py +222 -0
- biblicus/text/extract.py +210 -0
- biblicus/text/link.py +519 -0
- biblicus/text/markup.py +200 -0
- biblicus/text/models.py +319 -0
- biblicus/text/prompts.py +113 -0
- biblicus/text/redact.py +229 -0
- biblicus/text/slice.py +155 -0
- biblicus/text/tool_loop.py +334 -0
- {biblicus-0.14.0.dist-info → biblicus-0.15.1.dist-info}/METADATA +98 -28
- {biblicus-0.14.0.dist-info → biblicus-0.15.1.dist-info}/RECORD +30 -15
- biblicus/analysis/llm.py +0 -106
- {biblicus-0.14.0.dist-info → biblicus-0.15.1.dist-info}/WHEEL +0 -0
- {biblicus-0.14.0.dist-info → biblicus-0.15.1.dist-info}/entry_points.txt +0 -0
- {biblicus-0.14.0.dist-info → biblicus-0.15.1.dist-info}/licenses/LICENSE +0 -0
- {biblicus-0.14.0.dist-info → biblicus-0.15.1.dist-info}/top_level.txt +0 -0
biblicus/text/link.py
ADDED
|
@@ -0,0 +1,519 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Agentic text linking using virtual file edits.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
import re
|
|
8
|
+
from collections import Counter
|
|
9
|
+
from typing import Dict, Iterable, List, Optional, Sequence, Tuple
|
|
10
|
+
|
|
11
|
+
from jinja2 import Environment, StrictUndefined
|
|
12
|
+
|
|
13
|
+
from .markup import (
|
|
14
|
+
TextAnnotatedSpan,
|
|
15
|
+
build_span_context_section,
|
|
16
|
+
parse_span_markup,
|
|
17
|
+
strip_span_tags,
|
|
18
|
+
)
|
|
19
|
+
from .models import TextLinkRequest, TextLinkResult
|
|
20
|
+
from .tool_loop import request_confirmation, run_tool_loop
|
|
21
|
+
|
|
22
|
+
# Prefix string "link_" exported as a module constant. It is not referenced by
# any function in this module; presumably callers or request models use it as
# the default ``id_prefix`` -- TODO confirm against TextLinkRequest.
DEFAULT_LINK_ID_PREFIX = "link_"
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def apply_text_link(request: TextLinkRequest) -> TextLinkResult:
    """
    Apply text linking using a language model.

    The request text is handed to ``run_tool_loop``, which lets the model insert
    ``<span id=...>`` / ``<span ref=...>`` tags through string replacements. The
    resulting markup is validated, optionally repaired (missing ref spans are
    autofilled), and returned together with the parsed spans and any warnings.

    When ``request.mock_marked_up_text`` is set, the model is bypassed and the
    supplied markup is validated directly.

    :param request: Text link request.
    :type request: TextLinkRequest
    :return: Text link result.
    :rtype: TextLinkResult
    :raises ValueError: If model output is invalid or text is modified. Empty outputs trigger
        a confirmation round and return a warning when confirmed.
    """
    warnings: List[str] = []
    # The system prompt is rendered before the mock check, so a template error
    # surfaces even when mock markup is supplied.
    system_prompt = _render_system_prompt(
        request.system_prompt,
        id_prefix=request.id_prefix,
    )

    # Mock path: skip the model entirely and validate the provided markup.
    if request.mock_marked_up_text is not None:
        return _build_mock_result(
            request,
            request.mock_marked_up_text,
        )

    result = run_tool_loop(
        text=request.text,
        client=request.client,
        system_prompt=system_prompt,
        prompt_template=request.prompt_template,
        max_rounds=request.max_rounds,
        max_edits_per_round=request.max_edits_per_round,
        apply_str_replace=_apply_link_replace,
        validate_text=lambda current_text: _validate_link_markup(current_text, request.id_prefix),
        build_retry_message=lambda errors, current_text: _build_retry_message(
            errors, current_text, request.id_prefix
        ),
    )

    if not result.done:
        if result.last_error:
            # The loop ended with an error: try to salvage the markup when the only
            # problems are missing ref spans; otherwise fail with the last error.
            recovered = _attempt_missing_coverage_recovery(
                marked_up_text=result.text,
                id_prefix=request.id_prefix,
                warnings=warnings,
            )
            if recovered is not None:
                return recovered
            raise ValueError(f"Text link failed: {result.last_error}")
        # No error but the model never signalled completion: continue with a warning.
        warnings.append("Text link reached max rounds without done=true")

    # The model left the text untouched: ask it to confirm that nothing needs linking.
    if result.text == request.text:
        if result.last_error:
            raise ValueError(result.last_error)
        confirmation = request_confirmation(
            result=result,
            text=result.text,
            client=request.client,
            system_prompt=system_prompt,
            prompt_template=request.prompt_template,
            max_rounds=2,
            max_edits_per_round=request.max_edits_per_round,
            apply_str_replace=_apply_link_replace,
            confirmation_message=_build_empty_confirmation_message(result.text),
            validate_text=lambda current_text: _validate_link_markup(
                current_text, request.id_prefix
            ),
            build_retry_message=lambda errors, current_text: _build_retry_message(
                errors, current_text, request.id_prefix
            ),
        )
        if not confirmation.done:
            if confirmation.last_error:
                raise ValueError(f"Text link failed: {confirmation.last_error}")
            warnings.append("Text link confirmation reached max rounds without done=true")
        _validate_preserved_text(original=request.text, marked_up=confirmation.text)
        if confirmation.text == request.text:
            # Still unchanged after confirmation: return an empty result with a warning.
            warnings.append("Text link returned no spans; model confirmed empty result")
            return TextLinkResult(marked_up_text=confirmation.text, spans=[], warnings=warnings)
        # The confirmation round produced markup after all: validate and return it.
        spans = parse_span_markup(confirmation.text)
        errors = _validate_link_spans(spans, request.id_prefix)
        if errors:
            raise ValueError("; ".join(errors))
        return TextLinkResult(marked_up_text=confirmation.text, spans=spans, warnings=warnings)

    # Normal path: the markup must strip back to exactly the original text.
    _validate_preserved_text(original=request.text, marked_up=result.text)
    spans = parse_span_markup(result.text)
    errors = _validate_link_spans(spans, request.id_prefix)
    if errors:
        # Try to repair by adding ref spans for unwrapped repeats, then re-validate.
        autofilled = _autofill_ref_spans(result.text, spans)
        if autofilled is not None:
            result_text, spans, autofill_warnings = autofilled
            warnings.extend(autofill_warnings)
            errors = _validate_link_spans(spans, request.id_prefix)
            if errors:
                raise ValueError("; ".join(errors))
            return TextLinkResult(
                marked_up_text=result_text,
                spans=spans,
                warnings=warnings,
            )
        raise ValueError("; ".join(errors))

    return TextLinkResult(marked_up_text=result.text, spans=spans, warnings=warnings)
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def _build_mock_result(
    request: TextLinkRequest,
    marked_up_text: str,
) -> TextLinkResult:
    """
    Build a link result from pre-supplied markup instead of model output.

    :param request: Text link request providing the original text and id prefix.
    :type request: TextLinkRequest
    :param marked_up_text: Markup to validate and return.
    :type marked_up_text: str
    :return: Result carrying the markup, its parsed spans and no warnings.
    :rtype: TextLinkResult
    :raises ValueError: If the markup equals the original text, alters the text once
        tags are stripped, or fails span validation.
    """
    source_text = request.text
    if source_text == marked_up_text:
        raise ValueError("Text link produced no spans")
    _validate_preserved_text(original=source_text, marked_up=marked_up_text)

    parsed_spans = parse_span_markup(marked_up_text)
    problems = _validate_link_spans(parsed_spans, request.id_prefix)
    if not problems:
        return TextLinkResult(marked_up_text=marked_up_text, spans=parsed_spans, warnings=[])
    raise ValueError("; ".join(problems))
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def _render_system_prompt(template: str, *, id_prefix: str) -> str:
    """
    Render the system prompt template with the configured id prefix.

    The template is rendered with ``StrictUndefined``, so referencing any variable
    other than ``id_prefix`` fails instead of rendering as an empty string.

    :param template: Jinja2 template source.
    :type template: str
    :param id_prefix: Value exposed to the template as ``id_prefix``.
    :type id_prefix: str
    :return: Rendered prompt.
    :rtype: str
    """
    strict_environment = Environment(undefined=StrictUndefined)
    compiled = strict_environment.from_string(template)
    return compiled.render(id_prefix=id_prefix)
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
def _apply_link_replace(text: str, old_str: str, new_str: str) -> str:
    """
    Apply a single string replacement requested by the model.

    :param text: Current marked-up text.
    :type text: str
    :param old_str: Fragment to replace; must occur exactly once in ``text``.
    :type old_str: str
    :param new_str: Replacement fragment; may differ from ``old_str`` only by span tags.
    :type new_str: str
    :return: Text with the replacement applied.
    :rtype: str
    :raises ValueError: If ``old_str`` is missing, occurs more than once, or the
        replacement changes anything other than span tags.
    """
    hit_count = text.count(old_str)
    if not hit_count:
        raise ValueError("Text link replacement old_str not found")
    if hit_count != 1:
        raise ValueError("Text link replacement old_str is not unique")
    # Uniqueness is checked before content so the model gets the more basic error first.
    _validate_replace_text(old_str, new_str)
    return text.replace(old_str, new_str, 1)
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def _validate_replace_text(old_str: str, new_str: str) -> None:
    """
    Ensure a replacement changes nothing but span tags.

    :param old_str: Fragment being replaced.
    :type old_str: str
    :param new_str: Replacement fragment.
    :type new_str: str
    :raises ValueError: If the two fragments differ after span tags are stripped.
    """
    before = strip_span_tags(old_str)
    after = strip_span_tags(new_str)
    if before == after:
        return
    raise ValueError("Text link replacements may only insert span tags")
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def _validate_preserved_text(*, original: str, marked_up: str) -> None:
    """
    Ensure the marked-up text strips back to exactly the original text.

    :param original: Source text before linking.
    :type original: str
    :param marked_up: Text with span tags inserted.
    :type marked_up: str
    :raises ValueError: If removing span tags does not reproduce ``original``.
    """
    stripped = strip_span_tags(marked_up)
    if stripped == original:
        return
    raise ValueError("Text link edits modified the source text")
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def _validate_link_markup(marked_up_text: str, id_prefix: str) -> List[str]:
    """
    Validate link markup and collect every problem found.

    :param marked_up_text: Text containing span tags.
    :type marked_up_text: str
    :param id_prefix: Required prefix for span ids.
    :type id_prefix: str
    :return: Error messages; empty when the markup is valid. A markup parse failure
        is reported as a single message and no further checks run.
    :rtype: list[str]
    """
    try:
        parsed_spans = parse_span_markup(marked_up_text)
    except ValueError as parse_failure:
        return [str(parse_failure)]

    # Span rules first, then coverage of repeats, then the per-span minimality check.
    return [
        *_validate_link_spans(parsed_spans, id_prefix),
        *_validate_link_coverage(marked_up_text, parsed_spans),
        *_validate_link_span_minimality(parsed_spans),
    ]
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def _attempt_missing_coverage_recovery(
    *,
    marked_up_text: str,
    id_prefix: str,
    warnings: List[str],
) -> Optional[TextLinkResult]:
    """
    Try to salvage markup whose only problems are missing ref spans.

    Ref-only groups are first promoted to carry an id, then missing ref spans are
    autofilled. The recovery succeeds only if the repaired markup passes every check.

    :param marked_up_text: Markup left behind by the tool loop.
    :type marked_up_text: str
    :param id_prefix: Required prefix for span ids.
    :type id_prefix: str
    :param warnings: Warning list that is extended in place with promotion and
        autofill notes (also when the recovery is later abandoned).
    :type warnings: list[str]
    :return: Repaired result, or ``None`` when recovery is not possible.
    :rtype: TextLinkResult or None
    """

    def collect_errors(candidate_text: str, candidate_spans) -> List[str]:
        # Same three checks, in the same order, as _validate_link_markup.
        found = _validate_link_spans(candidate_spans, id_prefix)
        found.extend(_validate_link_coverage(candidate_text, candidate_spans))
        found.extend(_validate_link_span_minimality(candidate_spans))
        return found

    try:
        spans = parse_span_markup(marked_up_text)
    except ValueError:
        return None

    source_text = strip_span_tags(marked_up_text)
    promoted, promotion_notes = _promote_ref_spans_to_id(
        spans=spans,
        id_prefix=id_prefix,
    )
    if promotion_notes:
        warnings.extend(promotion_notes)
        spans = promoted
        marked_up_text = _render_span_markup(source_text, spans)

    initial_errors = collect_errors(marked_up_text, spans)
    # Nothing to fix, or problems autofill cannot address: give up.
    if not initial_errors:
        return None
    if not _errors_are_missing_coverage_only(initial_errors):
        return None

    filled = _autofill_ref_spans(marked_up_text, spans)
    if filled is None:
        return None
    repaired_text, repaired_spans, fill_notes = filled
    warnings.extend(fill_notes)

    if collect_errors(repaired_text, repaired_spans):
        return None
    return TextLinkResult(
        marked_up_text=repaired_text,
        spans=repaired_spans,
        warnings=warnings,
    )
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
def _errors_are_missing_coverage_only(errors: Sequence[str]) -> bool:
    """
    Report whether every error is a missing-ref-coverage error.

    :param errors: Validation error messages.
    :type errors: Sequence[str]
    :return: ``True`` when no message falls outside the coverage category
        (including when ``errors`` is empty).
    :rtype: bool
    """
    for message in errors:
        if not _is_ref_coverage_error(message):
            return False
    return True
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
def _is_ref_coverage_error(error: str) -> bool:
|
|
228
|
+
if error.startswith("Missing linked spans for repeated text"):
|
|
229
|
+
return True
|
|
230
|
+
if error.startswith("Id '") and error.endswith("must have at least one ref span"):
|
|
231
|
+
return True
|
|
232
|
+
if error.startswith("Repeated text '") and error.endswith("must include ref spans for repeats"):
|
|
233
|
+
return True
|
|
234
|
+
return False
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
def _promote_ref_spans_to_id(
    *,
    spans: Sequence[TextAnnotatedSpan],
    id_prefix: str,
) -> Tuple[List[TextAnnotatedSpan], List[str]]:
    """
    Turn the earliest ref span into an id span for texts that have no id span.

    Spans are grouped by wrapped text. For a group with refs but no id, the ref span
    that starts first is rewritten as ``id=<its ref value>``, provided that value
    starts with ``id_prefix``.

    :param spans: Parsed spans.
    :type spans: Sequence[TextAnnotatedSpan]
    :param id_prefix: Required prefix for a ref value to be promoted.
    :type id_prefix: str
    :return: The (possibly rewritten) span list and one warning per promotion. When
        nothing is promoted the spans are returned as a copied list with no warnings.
    :rtype: tuple[list[TextAnnotatedSpan], list[str]]
    """
    grouped: Dict[str, List[TextAnnotatedSpan]] = {}
    for item in spans:
        grouped.setdefault(item.text, []).append(item)

    promotions: Dict[int, str] = {}
    notes: List[str] = []
    for shared_text, members in grouped.items():
        if any("id" in member.attributes for member in members):
            continue
        refs = [member for member in members if "ref" in member.attributes]
        if not refs:
            continue
        # min() returns the first minimal element, matching a stable sort's head.
        earliest = min(refs, key=lambda member: (member.start_char, member.end_char))
        target = earliest.attributes.get("ref", "")
        if target.startswith(id_prefix):
            promotions[earliest.index] = target
            notes.append(f"Promoted ref span for '{shared_text}' to id '{target}'.")

    if not promotions:
        return list(spans), []

    rebuilt = [
        TextAnnotatedSpan(
            index=item.index,
            start_char=item.start_char,
            end_char=item.end_char,
            text=item.text,
            attributes=(
                {"id": promotions[item.index]} if item.index in promotions else item.attributes
            ),
        )
        for item in spans
    ]
    return rebuilt, notes
|
|
276
|
+
|
|
277
|
+
|
|
278
|
+
def _validate_link_spans(spans: Iterable[TextAnnotatedSpan], id_prefix: str) -> List[str]:
|
|
279
|
+
errors: List[str] = []
|
|
280
|
+
seen_ids: List[str] = []
|
|
281
|
+
id_counts: Dict[str, int] = {}
|
|
282
|
+
ref_counts: Dict[str, int] = {}
|
|
283
|
+
span_texts_by_id: Dict[str, Dict[str, set[str]]] = {}
|
|
284
|
+
spans_by_text: Dict[str, List[TextAnnotatedSpan]] = {}
|
|
285
|
+
for span in spans:
|
|
286
|
+
if len(span.attributes) != 1:
|
|
287
|
+
errors.append(f"Span {span.index} must include exactly one attribute")
|
|
288
|
+
continue
|
|
289
|
+
name, value = next(iter(span.attributes.items()))
|
|
290
|
+
if value.strip() == "":
|
|
291
|
+
errors.append(f"Span {span.index} has an empty value for attribute '{name}'")
|
|
292
|
+
continue
|
|
293
|
+
if name == "id":
|
|
294
|
+
if not value.startswith(id_prefix):
|
|
295
|
+
errors.append(f"Span {span.index} id '{value}' must start with '{id_prefix}'")
|
|
296
|
+
if value in seen_ids:
|
|
297
|
+
errors.append(f"Span {span.index} uses duplicate id '{value}'")
|
|
298
|
+
seen_ids.append(value)
|
|
299
|
+
id_counts[value] = id_counts.get(value, 0) + 1
|
|
300
|
+
span_texts_by_id.setdefault(value, {"id": set(), "ref": set()})["id"].add(span.text)
|
|
301
|
+
elif name == "ref":
|
|
302
|
+
if value not in seen_ids:
|
|
303
|
+
errors.append(f"Span {span.index} ref '{value}' does not match a previous id")
|
|
304
|
+
ref_counts[value] = ref_counts.get(value, 0) + 1
|
|
305
|
+
span_texts_by_id.setdefault(value, {"id": set(), "ref": set()})["ref"].add(span.text)
|
|
306
|
+
else:
|
|
307
|
+
errors.append(
|
|
308
|
+
f"Span {span.index} uses attribute '{name}' but only 'id' or 'ref' are allowed"
|
|
309
|
+
)
|
|
310
|
+
spans_by_text.setdefault(span.text, []).append(span)
|
|
311
|
+
for id_value, text_sets in span_texts_by_id.items():
|
|
312
|
+
id_texts = text_sets["id"]
|
|
313
|
+
ref_texts = text_sets["ref"]
|
|
314
|
+
if len(id_texts) > 1:
|
|
315
|
+
errors.append(f"Id '{id_value}' spans must wrap the same text")
|
|
316
|
+
if len(ref_texts) > 1:
|
|
317
|
+
errors.append(f"Ref spans for id '{id_value}' must wrap the same text")
|
|
318
|
+
if id_texts and ref_texts and id_texts != ref_texts:
|
|
319
|
+
id_text = next(iter(id_texts))
|
|
320
|
+
ref_text = next(iter(ref_texts))
|
|
321
|
+
errors.append(
|
|
322
|
+
f"Id '{id_value}' span text must match ref span text "
|
|
323
|
+
f"(id: '{id_text}', ref: '{ref_text}')"
|
|
324
|
+
)
|
|
325
|
+
for span_text, span_group in spans_by_text.items():
|
|
326
|
+
if len(span_group) <= 1:
|
|
327
|
+
continue
|
|
328
|
+
id_values = [span.attributes.get("id") for span in span_group if "id" in span.attributes]
|
|
329
|
+
ref_values = [span.attributes.get("ref") for span in span_group if "ref" in span.attributes]
|
|
330
|
+
if len(id_values) != 1:
|
|
331
|
+
errors.append(f"Repeated text '{span_text}' must have exactly one id span")
|
|
332
|
+
continue
|
|
333
|
+
id_value = id_values[0]
|
|
334
|
+
if not ref_values:
|
|
335
|
+
errors.append(f"Repeated text '{span_text}' must include ref spans for repeats")
|
|
336
|
+
continue
|
|
337
|
+
if any(ref != id_value for ref in ref_values):
|
|
338
|
+
errors.append(f"Repeated text '{span_text}' refs must match id '{id_value}'")
|
|
339
|
+
for id_value, count in id_counts.items():
|
|
340
|
+
if count > 1:
|
|
341
|
+
continue
|
|
342
|
+
if ref_counts.get(id_value, 0) == 0:
|
|
343
|
+
errors.append(f"Id '{id_value}' must have at least one ref span")
|
|
344
|
+
return errors
|
|
345
|
+
|
|
346
|
+
|
|
347
|
+
def _validate_link_coverage(marked_up_text: str, spans: Iterable[TextAnnotatedSpan]) -> List[str]:
    """
    Check that every occurrence of a linked text is wrapped in a span.

    For each distinct non-empty span text, the number of spans wrapping it is compared
    with the number of (non-overlapping substring) occurrences in the tag-free text.

    :param marked_up_text: Text containing span tags.
    :type marked_up_text: str
    :param spans: Parsed spans.
    :type spans: Iterable[TextAnnotatedSpan]
    :return: One error per span text that occurs more often than it is wrapped.
    :rtype: list[str]
    """
    source_text = strip_span_tags(marked_up_text)
    # Counter keeps first-seen order, so errors come out in span order.
    wrapped_tally = Counter(item.text for item in spans if item.text)

    problems: List[str] = []
    for label, wrapped in wrapped_tally.items():
        total = source_text.count(label)
        if wrapped < total:
            problems.append(
                f"Missing linked spans for repeated text '{label}' " f"({wrapped}/{total})"
            )
    return problems
|
|
362
|
+
|
|
363
|
+
|
|
364
|
+
# Matches runs of ASCII letters, digits and underscores. Used by
# _validate_link_span_minimality to split span text into tokens.
_WORD_PATTERN = re.compile(r"[A-Za-z0-9_]+")
|
|
365
|
+
|
|
366
|
+
|
|
367
|
+
def _validate_link_span_minimality(spans: Iterable[TextAnnotatedSpan]) -> List[str]:
    """
    Flag spans whose text contains the same word token more than once.

    A repeated token suggests one span swallowed several mentions that should have
    been wrapped separately.

    :param spans: Parsed spans.
    :type spans: Iterable[TextAnnotatedSpan]
    :return: One error per offending span, naming the first repeated token.
    :rtype: list[str]
    """
    problems: List[str] = []
    for item in spans:
        frequency = Counter(_WORD_PATTERN.findall(item.text))
        # Counter preserves first-seen order, so this is the earliest repeated token.
        duplicate = next((word for word, seen in frequency.items() if seen > 1), None)
        if duplicate is None:
            continue
        problems.append(
            f"Span {item.index} contains repeated token '{duplicate}'. "
            "Split repeated mentions into separate spans."
        )
    return problems
|
|
381
|
+
|
|
382
|
+
|
|
383
|
+
def _autofill_ref_spans(
    marked_up_text: str,
    spans: Iterable[TextAnnotatedSpan],
) -> Optional[Tuple[str, List[TextAnnotatedSpan], List[str]]]:
    """
    Add ref spans for unwrapped occurrences of text that already has an id span.

    For every id span, each occurrence of its text in the tag-free source that does
    not overlap an existing (or newly added) span becomes a ref span pointing at
    that id. All spans are then sorted by position, renumbered from 1 and rendered.

    :param marked_up_text: Text containing span tags.
    :type marked_up_text: str
    :param spans: Parsed spans for ``marked_up_text``.
    :type spans: Iterable[TextAnnotatedSpan]
    :return: ``(rendered markup, renumbered spans, warnings)``, or ``None`` when no
        ref span could be added.
    :rtype: tuple[str, list[TextAnnotatedSpan], list[str]] or None
    :raises ValueError: Propagated from rendering if the merged spans overlap.
    """
    source_text = strip_span_tags(marked_up_text)
    current = list(spans)
    taken = [(item.start_char, item.end_char) for item in current]
    additions: List[TextAnnotatedSpan] = []

    for anchor in current:
        anchor_id = anchor.attributes.get("id")
        anchor_text = anchor.text
        # Only id spans with non-empty text seed new refs.
        if not anchor_id or not anchor_text:
            continue
        for hit in re.finditer(re.escape(anchor_text), source_text):
            low, high = hit.span()
            if any(low < used_high and high > used_low for used_low, used_high in taken):
                continue
            additions.append(
                TextAnnotatedSpan(
                    # Placeholder index; every span is renumbered below.
                    index=1,
                    start_char=low,
                    end_char=high,
                    text=anchor_text,
                    attributes={"ref": anchor_id},
                )
            )
            taken.append((low, high))

    if not additions:
        return None

    ordered = sorted(
        current + additions,
        key=lambda item: (item.start_char, item.end_char),
    )
    renumbered = [
        TextAnnotatedSpan(
            index=position,
            start_char=item.start_char,
            end_char=item.end_char,
            text=item.text,
            attributes=item.attributes,
        )
        for position, item in enumerate(ordered, start=1)
    ]
    markup = _render_span_markup(source_text, renumbered)
    return markup, renumbered, [f"Autofilled {len(additions)} ref spans for repeated text."]
|
|
438
|
+
|
|
439
|
+
|
|
440
|
+
def _render_span_markup(text: str, spans: List[TextAnnotatedSpan]) -> str:
|
|
441
|
+
parts: List[str] = []
|
|
442
|
+
cursor = 0
|
|
443
|
+
for span in spans:
|
|
444
|
+
if span.start_char < cursor:
|
|
445
|
+
raise ValueError("Span overlap detected while rendering markup")
|
|
446
|
+
parts.append(text[cursor : span.start_char])
|
|
447
|
+
attrs = " ".join(f'{key}="{value}"' for key, value in span.attributes.items())
|
|
448
|
+
if attrs:
|
|
449
|
+
parts.append(f"<span {attrs}>")
|
|
450
|
+
else:
|
|
451
|
+
parts.append("<span>")
|
|
452
|
+
parts.append(text[span.start_char : span.end_char])
|
|
453
|
+
parts.append("</span>")
|
|
454
|
+
cursor = span.end_char
|
|
455
|
+
parts.append(text[cursor:])
|
|
456
|
+
return "".join(parts)
|
|
457
|
+
|
|
458
|
+
|
|
459
|
+
def _build_retry_message(errors: Sequence[str], current_text: str, id_prefix: str) -> str:
    """
    Build the message sent back to the model after a failed validation.

    The message lists the errors, adds the span context section and any targeted
    fix instructions, restates the linking rules and ends with the current text.

    :param errors: Validation error messages.
    :type errors: Sequence[str]
    :param current_text: Current marked-up text.
    :type current_text: str
    :param id_prefix: Required prefix for span ids, quoted in the instructions.
    :type id_prefix: str
    :return: Retry prompt.
    :rtype: str
    """
    bullet_list = "\n".join(f"- {problem}" for problem in errors)
    context = build_span_context_section(current_text, errors)
    guidance = _build_coverage_guidance(errors)
    sections = [
        "Your last edit did not validate.\n",
        "Issues:\n",
        f"{bullet_list}\n\n",
        f"{context}",
        f"{guidance}",
        "Please fix the markup using str_replace. Use id for first mentions and ref for repeats. ",
        "Reuse the same id for identical names and do not assign multiple ids to the same name. ",
        f"Ids must start with '{id_prefix}'. Try again.\n",
        "Current text:\n",
        f"---\n{current_text}\n---",
    ]
    return "".join(sections)
|
|
475
|
+
|
|
476
|
+
|
|
477
|
+
def _build_coverage_guidance(errors: Sequence[str]) -> str:
|
|
478
|
+
instructions: List[str] = []
|
|
479
|
+
for error in errors:
|
|
480
|
+
match = re.match(
|
|
481
|
+
r"Missing linked spans for repeated text '(.+)' \((\d+)/(\d+)\)",
|
|
482
|
+
error,
|
|
483
|
+
)
|
|
484
|
+
if match:
|
|
485
|
+
span_text = match.group(1)
|
|
486
|
+
instructions.append(
|
|
487
|
+
f"- Add ref spans for every remaining occurrence of '{span_text}' "
|
|
488
|
+
"using the same id as its first mention."
|
|
489
|
+
)
|
|
490
|
+
continue
|
|
491
|
+
if error.startswith("Id '") and error.endswith("must have at least one ref span"):
|
|
492
|
+
id_value = error.split("'")[1]
|
|
493
|
+
instructions.append(f'- Add ref spans with ref="{id_value}" for each later occurrence.')
|
|
494
|
+
continue
|
|
495
|
+
if error.startswith("Repeated text '") and error.endswith(
|
|
496
|
+
"must include ref spans for repeats"
|
|
497
|
+
):
|
|
498
|
+
span_text = error.split("'")[1]
|
|
499
|
+
instructions.append(
|
|
500
|
+
f"- Ensure '{span_text}' has one id on the first mention and ref spans on later mentions."
|
|
501
|
+
)
|
|
502
|
+
continue
|
|
503
|
+
if error.startswith("Id '") and "span text must match ref span text" in error:
|
|
504
|
+
id_value = error.split("'")[1]
|
|
505
|
+
instructions.append(
|
|
506
|
+
f"- Ensure every span with id/ref '{id_value}' wraps the exact same text."
|
|
507
|
+
)
|
|
508
|
+
if not instructions:
|
|
509
|
+
return ""
|
|
510
|
+
return "Fixes:\n" + "\n".join(instructions) + "\n\n"
|
|
511
|
+
|
|
512
|
+
|
|
513
|
+
def _build_empty_confirmation_message(text: str) -> str:
|
|
514
|
+
return (
|
|
515
|
+
"No linked spans were inserted. If there are truly no repeated names to link, "
|
|
516
|
+
"call done again without changes. Otherwise insert id/ref spans for the repeated names.\n"
|
|
517
|
+
"Current text:\n"
|
|
518
|
+
f"---\n{text}\n---"
|
|
519
|
+
)
|